%%capture
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
import numpy as np
import pandas as pd
import altair as alt
import geopandas as gpd
import json
# Load the Ryan White funding table and discard its 'TOTALS' summary row,
# which must not be plotted alongside the individual recipients.
rw_funding = pd.read_csv('drive/My Drive/Data Visualization/ryan_white_funding.csv')
rw_funding = rw_funding.loc[rw_funding['ent Name'].ne('TOTALS')]

# Population table, joined on the recipient name so that funding can be
# normalized by population. The inner join keeps only names present in both.
state_pop = pd.read_csv('drive/My Drive/Data Visualization/census_pop_2019.csv')
merge_fp = rw_funding.merge(state_pop, how='inner', left_on="ent Name", right_on="NAME")

# Remove the population columns this analysis never reads: the 2010-2018
# estimates plus every change/rank series for 2010-2019.
merge_fp = merge_fp.drop(
    columns=[
        *(f'POPESTIMATE{year}' for year in range(2010, 2019)),
        'NRANK_ESTBASE2010',
        *(
            f'{prefix}{year}'
            for prefix in (
                'NPOPCHG_',
                'PPOPCHG_',
                'NRANK_POPEST',
                'NRANK_NPCHG',
                'NRANK_PPCHG',
            )
            for year in range(2010, 2020)
        ),
    ]
)

# Per capita funding, used by a later visualization.
merge_fp['funding_pc'] = merge_fp['Total Part B Funding'].div(merge_fp['POPESTIMATE2019'])
# Add a funding-tier column that later charts use for colouring and filtering.
# The 10th/90th percentile cut-offs are computed from the data instead of
# being pasted in as literals, so the tiers stay correct if the funding file
# is refreshed.
# NOTE: the column keeps the name 'quartile' because later cells read it,
# even though it holds 'bottom_10' / 'middle' / 'top_10' tiers.
funding_totals = merge_fp['Total Part B Funding']
bottom_cutoff = funding_totals.quantile(0.1)
top_cutoff = funding_totals.quantile(0.9)
# The bottom tier is tested first so it takes precedence if both apply.
merge_fp['quartile'] = np.where(
    funding_totals <= bottom_cutoff,
    'bottom_10',
    np.where(funding_totals >= top_cutoff, 'top_10', 'middle'),
)
def my_theme():
    """Return the custom Altair theme applied to the charts in this notebook.

    Returns:
        dict: A mapping with a single ``'config'`` key holding the background,
        default view size, mark, title, axis, colour-range and legend
        settings.
    """
    font = 'Slab serif'
    return {
        'config': {
            'background': '#FFFFFF',
            'view': {
                'height': 250,
                'width': 250,
            },
            'mark': {
                'color': '#5F9EA0',
                # Mark config styles text through `font` / `fontSize`; the
                # `label*` names used previously are axis/legend properties
                # (and 'labelfontSize' was mis-cased), so they had no effect.
                'font': font,
                'fontSize': 12,
            },
            'title': {
                'font': font,
                'fontSize': 17,
                'subtitleFont': font,
            },
            'axis': {
                'labelFont': font,
                'titleFont': font,
                'labelFontSize': 12,
                'titleFontSize': 14,
                'grid': False,
            },
            'range': {
                'heatmap': ['#F8F8FF', '#DC143C'],
                'category': [
                    "#5F9EA0", "#666699", '#FF7F50', "#B0C4DE",
                    '#FFD700', '#DC143C', '#008000',
                ],
            },
            'legend': {
                'titleFont': font,
                'labelFont': font,
            },
        }
    }
# Publish the theme function to Altair under a lookup name, then make that
# name the active theme for the charts rendered below.
theme_name = 'my_theme'
alt.themes.register(theme_name, my_theme)
alt.themes.enable(theme_name)
Initial View: US-wide HIV occurrence and associated funding
We will first examine HIV incidence across the US, including in relation to current program funding levels.
For funding, I will look at the Health Resources and Services Administration’s (HRSA) Ryan White HIV/AIDS Program, which is the largest federal program focused specifically on providing HIV care and treatment services to low-income people living with HIV who are uninsured or underserved. This is accomplished through funding grants to cities/counties, states, and local community-based organizations.
The legislation was first enacted in 1990, and has been updated four times in response to changing needs, to best serve hard-to-reach populations. I will use this as a proxy in considering how funding aligns with occurrence, with particular regard to the most disadvantaged populations.
Chart 1: Examination of the distribution of funding across states and territories.
# Chart 1, left panel: histogram of Part B funding per state/territory with a
# red rule at the mean funding amount.
base = alt.Chart(rw_funding)

hist_bars = base.mark_bar().encode(
    alt.X("Total Part B Funding:Q", bin=True, axis=None),
    y='count()',
)

# Build the rule from the bare chart rather than from the bar layer. Deriving
# it from the bars also inherits y='count()', which ties the rule to the total
# row count on the shared y-scale instead of letting it span the plot height
# (the cited gallery example derives both layers from a bare chart).
mean = base.mark_rule(color='#DC143C').encode(
    x='mean(Total Part B Funding):Q',
    size=alt.value(5),
)

# Title and size are set once on the layered chart instead of on each layer.
histogram = (hist_bars + mean).properties(
    title={
        "text": ["Funding for HIV services concentrated around $0-20M", "per state/territory, with long tail"],
        "subtitle": [
            "Total Allocations in 2019 as per the Ryan White Act in grants to states and territories",
            "to improve the quality, availability and organization of HIV health care and support services",
            "The red line denotes mean funding amount",
        ],
    },
    width=325,
    height=300,
)

# Chart 1, right panel: recipients in the top funding tier, largest first.
merge_use = merge_fp[merge_fp['quartile'] != "middle"]
top_chart = alt.Chart(merge_use[merge_use['quartile'] == 'top_10']).mark_bar(size=37).encode(
    # Channel classes now match their channels (previously y=alt.X, x=alt.Y).
    y=alt.Y('ent Name', title='State or Territory', sort='-x'),
    x=alt.X('Total Part B Funding', title="Total Part B Funding, USD"),
    color=alt.Color('ent Name:N', legend=None),
).properties(
    title={
        "text": ["CA, NY, FL, TX,GA, and IL receive largest funding amounts "],
        "subtitle": ["Total Allocations as per the Ryan White Act, for the top 10% of funding recipients"],
    },
    width=325,
    height=300,
)

alt.hconcat(histogram, top_chart)
Data source: https://hab.hrsa.gov/about-ryan-white-hivaids-program/part-b-grants-states-territories, Health Resources and Services Administration (HRSA)
Citation for getting mean on histogram: https://altair-viz.github.io/gallery/histogram_with_a_global_mean_overlay.html
The left (full) graph depicts the total Part B funding allocation by state or territory for the Ryan White Act in a histogram, with the red line indicating the mean funding amount. As described above, this provides grant funding for states to administer HIV services and treatment. There are several subcomponents of this allocation, which are based on different measures such as reported number of people living with HIV in the preceding year; minority/emerging community representation; and other measures of demonstrated need. This graph illustrates that the majority of states/territories receive 0 -$20M of funding, with more dispersion in higher amounts. On the right chart, when we double click in, we see the states on the right of the histogram receiving highest funding include CA, NY, FL, TX, Georgia, and Illinois. Even among these, CA receives almost four times the funding amount to provide HIV services than IL. In sum, this analysis allows us to understand who the "highest" and "lowest" recipients are, which we can return to when considering prevalence of new diagnoses. Of course, population plays a role, so we will investigate in the following chart the relationship between funding and per capita funding.
Chart 2: Examination of relationship between overall and per capita funding by state/territory
# Attach short abbreviations so the associated state/territory can be shown
# compactly on the chart below.
state_abbrev = pd.read_csv('drive/My Drive/Data Visualization/us_states.csv')
merge_fp = pd.merge(merge_fp, state_abbrev, left_on="ent Name", right_on="l_name")

# Exploratory chart showing per capita funding against total funding.
# Citation for code consulted to get the labels to appear:
# https://altair-viz.github.io/gallery/scatter_with_labels.html
# Only the outlier states/territories are labelled; every other row gets an
# empty label. (This step was previously repeated twice; once is enough.)
outlier_abbrevs = ['DC', 'PR', 'NY', 'FL', 'TX', 'CA', 'GA', 'MD', 'MS', 'SC', 'IL']
merge_fp['label_col'] = np.where(merge_fp.abbrev.isin(outlier_abbrevs), merge_fp['abbrev'], '')

points = alt.Chart(merge_fp).mark_circle().encode(
    # The earlier sort='-y' was dropped: sorting by another channel only
    # applies to discrete axes and has no effect on this quantitative one.
    x=alt.X('Total Part B Funding', title='Total Funding level, USD'),
    y=alt.Y('funding_pc', title='Per Capita Funding, USD'),
    color=alt.Color('quartile', legend=alt.Legend(title='Funding Quantile')),
).properties(
    title={
        "text": ["Slight positive trend in per capita funding relative to total funding, but some outliers"],
        "subtitle": ["Total Part B Allocations as per the Ryan White Act in grants to states by territory, compared to the amount adjusted per capita"],
    },
    width=400,
    height=400,
)

# Text layer reuses the point encodings and is nudged right of each point.
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7,
).encode(
    text='label_col',
)

points + text
Data source: https://hab.hrsa.gov/about-ryan-white-hivaids-program/part-b-grants-states-territories, Health Resources and Services Administration (HRSA), as well as census.gov for the 2019 population levels
In the above chart, I sought to investigate whether the total allocations scaled along with the respective populations of the states. I apply color to reinforce which are the highest and lowest recipients. We can see from the chart that there are several outlier states which receive high per capita funding relative to the total amount, which could imply that they qualify for certain special adjustments to base Part B grant funding (for instance, a larger disadvantaged population). The clustering near the origin represents the large bucket in the initial histogram of states that receive funding around the mean. Further, normalizing for population, we can see that some of the states which, as per above, seemed to receive larger amounts of funding (for instance, Florida, New York, and California) receive similar per capita amounts to the majority of states (albeit a bit higher); however, there are some territories which now appear as outliers on a per capita basis, such as Puerto Rico and DC, which overall are not in the highest funding level bucket but do receive quite a high per capita amount.
Chart 3: Examination of HIV incidence rates across the US
# US state shapes, plus the per-state HIV table; the two are joined on the
# state name and only names found in both survive the inner join.
state_gdf = gpd.read_file('drive/My Drive/Data Visualization/cb_2018_us_state_500k.shp')
hiv_18 = pd.read_csv('drive/My Drive/Data Visualization/cdc_hiv_18.csv')
state_gdf_merge = state_gdf.merge(hiv_18, left_on='NAME', right_on='area', how='inner')

# Leave Guam and American Samoa off the map for visualization purposes.
state_gdf_merge = state_gdf_merge[~state_gdf_merge['area'].isin(['Guam', 'American Samoa'])]

# Hand Altair the GeoJSON feature list, so columns are reached through the
# 'properties.' prefix.
# Citation for code consulted: https://www.districtdatalabs.com/altair-choropleth-viz/
choro_json_state = json.loads(state_gdf_merge.to_json())
choro_data_state = alt.Data(values=choro_json_state['features'])

(
    alt.Chart(choro_data_state)
    .mark_geoshape(stroke='black', strokeWidth=.25)
    .encode(
        color=alt.Color(
            'properties.total_rate',
            type='quantitative',
            title='HIV diagnosis rate',
        ),
    )
    .properties(
        title={
            "text": ["HIV diagnosis rate in the USA in 2018 varies by state, appearing lower in Midwest/North"],
            "subtitle": ["HIV diagnoses per 100,000 population by state"],
        },
        projection={'type': 'albersUsa'},
        width=600,
        height=400,
    )
)
Data source: https://www.cdc.gov/hiv/pdf/library/reports/surveillance/cdc-hiv-surveillance-report-2018-vol-30.pdf, the CDC's HIV surveillance report; converted the page 114 to a .csv for use
The above chart looks at the HIV rate by state for 2018, the most recent data that would have preceded allocations for 2019. We can see high rates of HIV diagnoses in states such as Florida and Georgia, which correspond to the highest allocations of funding for example. Overall, we can see that rates appear highest in the Southeast, with some additional hotspots in the south/toward the west. The midwest appears to have less prevalence.
Chart 4: Double click into regional differences
source = hiv_18

# The x position carries no data: it is random noise that spreads the points
# of each region sideways, so the axis shows a single unlabelled tick at 0.
jitter_x = alt.X(
    'jitter:Q',
    title=None,
    axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
    scale=alt.Scale(),
)

# Region names are drawn rotated underneath each column.
region_header = alt.Header(
    labelAngle=-90,
    titleOrient='top',
    labelOrient='bottom',
    labelAlign='right',
    labelPadding=2,
    title=None,
)

stripplot = (
    alt.Chart(source, width=100)
    .mark_circle(size=20)
    # Gaussian jitter generated with a Box-Muller transform
    .transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')
    .encode(
        x=jitter_x,
        y=alt.Y('total_rate:Q', title='HIV diagnosis rate'),
        color=alt.Color('region:N', legend=None),
        column=alt.Column('region:N', header=region_header),
    )
    .properties(
        title={
            "text": 'HIV diagnosis rate in 2018 according to region suggests higher prevalence in Southeast',
            "subtitle": 'HIV diagnoses per 100,000 population for each state or territory, grouped by region to show trends',
        }
    )
    # Butt the region columns together and drop each panel's outline.
    .configure_facet(spacing=0)
    .configure_view(stroke=None)
)
stripplot
Data source: https://www.cdc.gov/hiv/pdf/library/reports/surveillance/cdc-hiv-surveillance-report-2018-vol-30.pdf, the CDC's HIV surveillance report; converted the page 114 to a .csv for use
Source for regional classification: https://www.nationalgeographic.org/maps/united-states-regions/
Leveraging a commonly used classification for US regions (where "other" refers to US territories, e.g. Puerto Rico or the U.S. Virgin Islands), we can confirm the above hypothesis that the Southeast appears to have higher rates of HIV diagnoses - not only does it contain the highest rate, but its entire distribution also appears to be shifted upward as compared to other regions. From the inverse perspective, the Midwest appears to be clustered at a lower rate. However, there is of course variation within regions, and in the remaining visuals we will further investigate NY, specifically NYC, as there is rich data available on demographics of diagnoses - and there is a higher than average diagnosis rate at 12.6 individuals per 100,000 population.
Moving on to exploring some of the NYC HIV dataset for remaining visuals
We will now focus on NYC, which as we can see from the preceding section is both a key recipient of funding and while not the highest, has a relatively high HIV diagnosis rate. The open city portal has interesting demographic detail that we will use to explore.
# Neighborhood shapes for the NYC maps. Shapefile source:
# https://www1.nyc.gov/site/doh/data/health-tools/maps-gis-data-files-for-download.page
shape_file_loc = 'drive/My Drive/Data Visualization/UHF_42_DOHMH_2009.shp'
gdf = gpd.read_file(shape_file_loc)

# NYC HIV data
nyc_hiv = pd.read_csv('drive/My Drive/Data Visualization/DOHMH_HIV_AIDS_Annual_Report.csv')

# For the map we want the rows aggregated over every gender, race and age,
# i.e. those where all three columns read 'All'.
nyc_hiv_clean = nyc_hiv[nyc_hiv[['Gender', 'Race', 'Age']].eq('All').all(axis=1)]

# Join the aggregated rows onto the shapes by neighborhood name, then keep
# only 2015 for the map.
gdf_merge = gdf.merge(nyc_hiv_clean, left_on='UHF_NEIGH', right_on='UHF', how='inner')
gdf_15 = gdf_merge.loc[gdf_merge['Year'].eq(2015)]
# Function for making identity-coordinate maps.
# Additional citations:
#   https://altair-viz.github.io/user_guide/configuration.html#config-projection
#   (documentation used to get around coordinate system issues)
#   Medium article with sample graph, for which I leveraged code as a starting point:
#   https://medium.com/dataexplorations/creating-choropleth-maps-in-altair-eeb7085779a1
def gen_map(geodata, color_column, title, subtitle, scale_title, width=800, height=800):
    """Generate a choropleth map coloured by one quantitative column.

    Args:
        geodata: Data passed straight to ``alt.Chart``.
        color_column: Field name used for the fill colour; it is encoded as
            quantitative.
        title: Chart title (rendered as a single line).
        subtitle: Chart subtitle (rendered as a single line).
        scale_title: Title shown on the colour legend.
        width: Chart width in pixels. Defaults to 800, the previously
            hard-coded size.
        height: Chart height in pixels. Defaults to 800, the previously
            hard-coded size.

    Returns:
        The configured Altair chart. Because it carries a top-level
        ``configure_projection`` setting, it is meant to be displayed on its
        own.
    """
    return (
        alt.Chart(geodata)
        .mark_geoshape(stroke='black', strokeWidth=.25)
        .encode(alt.Color(color_column, type='quantitative', title=scale_title))
        .properties(
            width=width,
            height=height,
            title={"text": [title], "subtitle": [subtitle]},
        )
        # Draw the coordinates as-is (no geographic projection), flipped
        # vertically.
        .configure_projection(type='identity', reflectY=True)
    )
Chart 5: Visualization of new HIV diagnoses by New York City neighborhood
# Convert the 2015 GeoDataFrame to GeoJSON and pass Altair its feature list;
# columns are then addressed with the 'properties.' prefix.
choro_json = json.loads(gdf_15.to_json())
choro_data = alt.Data(values=choro_json['features'])

gen_map(
    geodata=choro_data,
    color_column='properties.HIV diagnoses',
    title='New HIV Diagnoses in New York vary by and within borough',
    subtitle='New HIV Diagnoses for all ages, races, and ethnicities in 2015 by neighborhood (as designated by the United Hospital Fund)',
    scale_title='Number of diagnoses',
)
Data source: NYC's annual HIV-AIDS report https://catalog.data.gov/dataset/dohmh-hiv-aids-annual-report; shapefile for mapping the health neighborhoods https://www1.nyc.gov/site/doh/data/health-tools/maps-gis-data-files-for-download.page
The above choropleth map shows new HIV diagnoses for the most recent year of data, 2015, by neighborhood boundaries according to the United Hospital Fund's delineation (which is how it is set forth in the report). We can see that the neighborhoods with the highest number of new diagnoses include Bedford-Stuyvesant, Washington Heights, and West Queens. Areas such as Staten Island have far fewer HIV diagnoses. In future graphs, I will dig into what attributes may distinguish these neighborhoods. Note that in examining the data, there was little differentiation among gender (predominantly, it is men that are affected), thus I will elect to explore other demographic characteristics.
Chart 6: Investigation of racial composition of diagnoses, including neighborhoods with highest HIV rate
# Overall view: male rows aggregated over every borough, UHF area and age.
citywide_male = (
    nyc_hiv[['Borough', 'UHF', 'Age']].eq('All').all(axis=1)
    & nyc_hiv['Gender'].eq('Male')
)
hiv_all = nyc_hiv[citywide_male]

# Rows with Race == 'All' feed the bars; the per-race rows feed the lines.
is_all_races = hiv_all['Race'] == 'All'

bars = (
    alt.Chart(hiv_all[is_all_races])
    .mark_bar(size=40, color='#DC143C')
    .encode(
        x=alt.X("Year:N", title="Year"),
        y=alt.Y("HIV diagnoses:Q"),
    )
)

line_1 = (
    alt.Chart(hiv_all[~is_all_races])
    .mark_line(point=True)
    .encode(
        x=alt.X("Year:N", title="Year"),
        y=alt.Y("HIV diagnoses:Q"),
        color=alt.Color('Race', legend=alt.Legend(title='Race')),
    )
    .properties(
        title={
            "text": ['Overall HIV diagnoses declining over time,', 'with rate varying by race'],
            "subtitle": ["Annual HIV diagnoses for NYC and broken down by Race"],
        },
        width=400,
    )
)

bars + line_1
# Racial composition of male HIV diagnoses in three neighborhoods:
# Bed-Stuy/Crown Heights, Washington Heights - Inwood and West Queens.
# The three panels differ only in their filter and titles, so the shared
# filter and chart code lives in two small helpers.
def _male_rows_by_race(borough, uhf):
    """Return nyc_hiv rows for males of all ages in one UHF area, split by race."""
    mask = (
        (nyc_hiv['Borough'] == borough)
        & (nyc_hiv['UHF'] == uhf)
        & (nyc_hiv['Age'] == 'All')
        & (nyc_hiv['Gender'] == 'Male')
        & (nyc_hiv['Race'] != 'All')
    )
    return nyc_hiv[mask]


def _race_share_bars(data, title_lines, subtitle_lines):
    """Return a 200x200 per-year bar chart of diagnoses, stacked by race and normalized."""
    return alt.Chart(data).mark_bar(size=25).encode(
        x=alt.X('Year:O', title='Year'),
        # NOTE(review): stack="normalize" makes each bar show shares, while
        # the axis title and subtitles still speak of totals - confirm wording.
        y=alt.Y("HIV diagnoses:Q", title='New HIV Diagnoses', stack="normalize"),
        color=alt.Color('Race', legend=None),
    ).properties(
        title={"text": title_lines, "subtitle": subtitle_lines},
        height=200,
        width=200,
    )


bedstuy_hiv_race = _male_rows_by_race('Brooklyn', 'Bedford Stuyvesant - Crown Heights')
bedstuy = _race_share_bars(
    bedstuy_hiv_race,
    ['African Americans predominately being', 'diagnosed with HIV in Bed-Stuy'],
    ['Total number of new HIV diagnoses over time in', 'Bedford-Stuyvesant for males, by race/ethnicity'],
)

wash_hiv_race = _male_rows_by_race('Manhattan', 'Washington Heights - Inwood')
wash = _race_share_bars(
    wash_hiv_race,
    ['More even distribution of HIV diagnoses', 'by race in Washington Heights - Inwood'],
    ['Total number of new HIV diagnoses over time in', 'Washington Heights for males, by race/ethnicity'],
)

wq_hiv_race = _male_rows_by_race('Queens', 'West Queens')
wq = _race_share_bars(
    wq_hiv_race,
    ['Latinos predominately being', 'diagnosed with HIV in West Queens'],
    ['Total number of new HIV diagnoses over time in', 'West Queens for males, by race/ethnicity'],
)

alt.hconcat(bedstuy, wash, wq)
Data source: NYC's annual HIV-AIDS report https://catalog.data.gov/dataset/dohmh-hiv-aids-annual-report
The above chart displays the total number of HIV diagnoses over time, with a racial breakdown. We can see that there has been an overall decrease in HIV diagnoses - but for some groups, notably Asian and other, the number remains low and fairly constant over time. We see bigger declines in the White and Black groups, followed by Hispanic Latino. When we double-click into the neighborhoods with the highest amount of new diagnoses - namely, the Bed-Stuy/Crown Heights neighborhood of Brooklyn, which as per above had the highest number of new diagnoses, along with West Queens and Washington Heights - we see the same overarching story but different racial patterns within neighborhoods. The vast majority of cases were of African Americans, followed by Hispanic Latino in Bed-Stuy. If we were to toggle and look at other high-diagnosis neighborhoods in NYC, there is similarly a higher representation of minorities, notably Hispanic/Latino in these two neighborhoods. Overall, in all three neighborhoods, the proportion of Latinos among total new infections has appeared to increase, which is interesting as despite a 2014 - 2015 jump, this population otherwise sees a decline over the period.
Chart 7: Consideration of age and its prevalence in the highest diagnosis neighborhoods
# Age composition of male HIV diagnoses in the same three neighborhoods:
# Bed-Stuy/Crown Heights, Washington Heights - Inwood and West Queens.
# (The earlier comment said "gender", but the rows are split by age.)
# The three panels differ only in their filter and titles, so the shared
# filter and chart code lives in two small helpers.
def _male_rows_by_age(borough, uhf):
    """Return nyc_hiv rows for males of all races in one UHF area, split by age group."""
    mask = (
        (nyc_hiv['Borough'] == borough)
        & (nyc_hiv['UHF'] == uhf)
        & (nyc_hiv['Age'] != 'All')
        & (nyc_hiv['Gender'] == 'Male')
        & (nyc_hiv['Race'] == 'All')
    )
    return nyc_hiv[mask]


def _age_area_rows(data, title_lines, subtitle_lines):
    """Return an area chart of diagnoses per year with one 200x30 row per age group."""
    return alt.Chart(data).mark_area().encode(
        x=alt.X('Year:O', title='Year'),
        # Fixed 0-60 domain so the rows and the three panels are comparable.
        y=alt.Y("HIV diagnoses:Q", title=None, scale=alt.Scale(domain=[0, 60])),
        color="Age:N",
        row="Age:N",
    ).properties(
        title={"text": title_lines, "subtitle": subtitle_lines},
        height=30,
        width=200,
    )


bedstuy_hiv_age = _male_rows_by_age('Brooklyn', 'Bedford Stuyvesant - Crown Heights')
bedstuy_g = _age_area_rows(
    bedstuy_hiv_age,
    ['HIV diagnoses by age for males in', 'Bedford Stuyvesant - Crown Heights'],
    ['Total number of new HIV diagnoses over time in', 'Bed-Stuy of males, by age'],
)

wash_hiv_age = _male_rows_by_age('Manhattan', 'Washington Heights - Inwood')
wash_g = _age_area_rows(
    wash_hiv_age,
    ['HIV diagnoses by age for males in', 'Washington Heights - Inwood'],
    ['Total number of new HIV diagnoses over time in', 'Washington Heights of males, by age'],
)

wq_hiv_age = _male_rows_by_age('Queens', 'West Queens')
wq_g = _age_area_rows(
    wq_hiv_age,
    ['HIV diagnoses by age for males in', 'West Queens'],
    ['Total number of new HIV diagnoses over time in', 'West Queens of males, by age'],
)

alt.hconcat(bedstuy_g, wash_g, wq_g)
Data source: NYC's annual HIV-AIDS report https://catalog.data.gov/dataset/dohmh-hiv-aids-annual-report
If we continue to look at the highest diagnosis areas, overall, there does not seem to be a ton of differentiation in terms of the age distribution of new infections. When we consider age, we can see that across the three neighborhoods with high HIV rates, 20-29 year olds account for the largest number of new infections. In West Queens, the subsequent 30-39 year old age group is higher than in the other two neighborhoods. In sum, new infections seem to be concentrated in the "millennial" group.
Chart 8: Investigation into HIV-related death rates vs. non-HIV related death rates
# Borough-level male rows broken out by race (all UHF areas and ages
# combined), for every year except 2015.
borough_race_rows = (
    nyc_hiv['Borough'].ne('All')
    & nyc_hiv['UHF'].eq('All')
    & nyc_hiv['Age'].eq('All')
    & nyc_hiv['Gender'].eq('Male')
    & nyc_hiv['Race'].ne('All')
    & nyc_hiv['Year'].ne(2015)
)
# 99999 is blanked to NaN in every column - presumably a placeholder for
# unavailable values; confirm against the dataset documentation.
hiv_cut = nyc_hiv[borough_race_rows].replace(99999, np.nan)

(
    alt.Chart(hiv_cut)
    .mark_circle()
    .encode(
        x=alt.X('HIV-related death rate', scale=alt.Scale(zero=False)),
        y=alt.Y('Non-HIV-related death rate', scale=alt.Scale(zero=False, padding=1)),
        color='Race',
        size='HIV diagnoses',
    )
    .properties(
        title={
            "text": ['General positive association between HIV and non-HIV related death rates among different NYC population groups'],
            "subtitle": ['HIV and non-HIV related death rates, scaled by number of diagnoses, by race'],
        },
        width=600,
        height=400,
    )
)
In the above chart, we dig deeper into the relationship between race and HIV in NYC, as there was more differentiation than when considering attributes such as age and gender. Here, we look at the HIV-related death rate and non-HIV-related death rate among different races at the borough level, for all years that it is possible to gather data from (up until 2014). The goal here would be to see if there is a clustering of certain outcomes based on race, with size acting to indicate the severity of the problem. Overall, the two rates tend to increase in parallel, though we can see some outliers in both directions (e.g. a higher HIV-related death rate relative to the non-HIV-related rate, or vice versa). The outliers tend to be in groups with a smaller number of diagnoses, indicating the potential that this has more to do with sample size. Finally, it is worthwhile to note that, as one might expect, the death rate is lower for populations such as White as compared to Black, which could have to do with socioeconomic status.